import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Suppress all warnings (the filter below is global, so it hides pandas warnings and every other library's as well)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn.metrics import precision_score ,recall_score, f1_score
from sklearn.metrics import accuracy_score
from plotly.offline import iplot, init_notebook_mode,download_plotlyjs
import plotly.graph_objects as go
import plotly.express as px
# Use this code to display the output of every expression in a cell instead of only the output of the last one. See the next cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Read the raw table from disk and preview its first rows and header.
df_temp = pd.read_csv('global_temp_change.csv')
df_temp.head()
df_temp.columns
# Give the generic 'Value' column a descriptive name, then show the header
# again to confirm the rename took effect.
value_to_temperature = {'Value': 'temperature'}
df_temp = df_temp.rename(columns=value_to_temperature)
df_temp.columns
We only need the following features from the data; the rest are either constant or irrelevant:
['Area Code', 'Area', 'Months Code', 'Months', 'Year', 'Value'] (where 'Value' has been renamed to 'temperature' above)
# Columns kept for the analysis; everything else in the table is discarded.
kept_columns = ['Area Code', 'Area', 'Months Code', 'Months', 'Year', 'temperature']
df_temp = df_temp[kept_columns]
df_temp.head()
# Show how many values are missing per column, then drop every row that
# has at least one missing value.
df_temp.isna().sum()
df_temp = df_temp.dropna()


def _floor_at_zero(value):
    """Return 0 for a negative value, otherwise the value unchanged."""
    if value < 0:
        return 0
    return value


# 'pos_temp' is a copy of 'temperature' with negative entries replaced by 0.
df_temp['pos_temp'] = df_temp.temperature.apply(_floor_at_zero)
df_temp.head()
# Keep only the rows whose 'Months' value is 'Meteorological year'.
is_meteorological_year = df_temp['Months'] == 'Meteorological year'
df_temp_metro = df_temp[is_meteorological_year]
df_temp_metro.head()
# The 20 rows with the largest 'temperature' values, largest first.
df_temp_top_20 = df_temp_metro.nlargest(n=20, columns='temperature')
df_temp_top_20
# Bubble chart of the top-20 rows: Year on x, temperature on y, and the
# marker size also driven by temperature.
fig = px.scatter(
    df_temp_top_20,
    x="Year",
    y="temperature",
    size='temperature',
    title='Highest Temperature Anomaly since 1960',
)
fig.show()

# Animated world map: one frame per Year, countries matched by their name in
# 'Area', and the fill colour taken from 'pos_temp'.
choropleth_options = dict(
    locations="Area",
    locationmode="country names",
    animation_frame="Year",
    animation_group="Area",
    color="pos_temp",
    color_continuous_scale='reds',
    hover_name="Area",
)
fig = px.choropleth(df_temp_metro, **choropleth_options)
fig.show()
# data = [ dict(
# type = 'choropleth',
# colorscale = 'Rainbow',
# locations = df_temp['Area'],
# z = df_temp['pos_temp'],
# text = df_temp['Area'],
# colorbar = dict(
# title = 'Years',
# titlefont=dict(size=25),
# tickfont=dict(size=18))
# ) ]
# # Define layout
# layout = dict(
# title = 'Life Expectancy at Birth',
# titlefont = dict(size=40),
# geo = dict(
# showframe = True,
# showcoastlines = True,
# projection = dict(type = 'equirectangular')
# )
# )
# # Plot
# fig = dict( data=data, layout=layout )
# #plot_url = plotly.offline.plot(fig, validate=False, filename='world.html')
# fig.show()
# Build `ts`: a date-indexed, chronologically sorted frame holding the
# monthly 'temperature' values for the United States of America.
df_temp_usa = df_temp.query("Area=='United States of America'")
ts_temp = df_temp_usa[['Months', 'Year', 'temperature']]
ts_temp.head()
ts_temp.Months.unique()

# Calendar month name -> month number. An explicit table is used instead of
# parsing with the '%B' format because '%B' matches the month names of the
# active locale, which are not guaranteed to be English.
month_numbers = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
}

# Keep only the twelve calendar months (this drops every other 'Months'
# label, such as seasonal or yearly aggregates). The explicit .copy() makes
# the filtered frame independent, so the column assignments below do not
# write into a slice of ts_temp; that situation would otherwise go unnoticed
# because warnings are silenced globally at the top of the file.
is_calendar_month = ts_temp['Months'].isin(list(month_numbers))
ts_temp_date = ts_temp[is_calendar_month].copy()
ts_temp_date.Months.unique()

# Replace the month names by their numbers.
ts_temp_date['Months'] = ts_temp_date['Months'].map(month_numbers)
# Combine year and month (with the day fixed to 1) into a single date.
ts_temp_date['date'] = pd.to_datetime(ts_temp_date[['Year', 'Months']].assign(DAY=1))
ts_temp_date.head()

# Index by date and sort chronologically, without in-place mutation.
ts = ts_temp_date[['date', 'temperature']].set_index('date').sort_index()
import plotly.graph_objects as go

# Interactive line chart of the series; a range slider is attached to the
# x-axis in a separate step.
fig = px.line(ts, x=ts.index, y=ts['temperature'], title='Time Series with Rangeslider')
fig = fig.update_xaxes(rangeslider_visible=True)
fig.show()
# Preview the first rows and the index of the series.
ts.head()
ts.index
# Static matplotlib rendering of the same series.
ts.plot(figsize=(16, 6));
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the series into trend, seasonal and residual components.
# `period` replaces the `freq` keyword, which current statsmodels releases
# no longer accept. The value is 12 because `ts` holds one observation per
# calendar month, so one seasonal cycle spans 12 samples.
decomposition = seasonal_decompose(ts, period=12)

# Gather the trend, seasonality and noise of the decomposed object
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# Plot the input series and its three components as four stacked panels.
# The first panel shows `ts` itself: that is the series that was decomposed,
# and taking a logarithm of it would turn every negative value into NaN.
plt.figure(figsize=(12, 8))
panels = [
    (411, ts, 'Original'),
    (412, trend, 'Trend'),
    (413, seasonal, 'Seasonality'),
    (414, residual, 'Residuals'),
]
for position, series, label in panels:
    plt.subplot(position)
    plt.plot(series, label=label, color="blue")
    plt.legend(loc='best')
plt.tight_layout()
plt.show()
from statsmodels.tsa.stattools import adfuller

# Run the Dickey-Fuller test on the temperature column and report the result.
print('Ho: The data is not stationary\nH1: The data is Stationary\n\n')
print('Results of Dickey-Fuller Test:')
dftest = adfuller(ts['temperature'])

# The first four entries of the result are labelled below; the fifth entry is
# a mapping whose items are appended as 'Critical Value (<key>)' rows.
summary_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
adf_report = dict(zip(summary_labels, dftest[:4]))
for level, threshold in dftest[4].items():
    adf_report[f'Critical Value ({level})'] = threshold
dfoutput = pd.Series(adf_report)
print(dfoutput)

# Decide at the 0.05 significance level.
verdict = (
    'p Value less than 0.05, we reject the null hypothesis and the data is Stationary'
    if dfoutput['p-value'] < 0.05
    else 'p Value greater than 0.05, we fail to reject the null hypothesis and the data is not Stationary'
)
print(verdict)
from matplotlib.pylab import rcParams
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Default figure size for the correlation plots that follow.
rcParams['figure.figsize'] = (14, 5)

# Partial and full autocorrelation of the series, both up to the same lag.
max_lag = 100
plot_pacf(ts, lags=max_lag);
plot_acf(ts, lags=max_lag);
# pandas' own autocorrelation plot, followed by the size of the series.
pd.plotting.autocorrelation_plot(ts)
ts.shape
# Divide into train and validation sets at a single cut-off date.
# Label-based slices such as ts[:'2001-01-01'] and ts['2001-01-01':] include
# the boundary on both sides, which would put the 2001-01-01 observation in
# both sets. Strict/non-strict comparisons keep the two sets disjoint:
# everything before the cut-off trains, the cut-off and everything after tests.
split_date = pd.Timestamp('2001-01-01')
train = ts[ts.index < split_date]
# .copy() gives an independent frame: a prediction column is added to `test`
# further down, and that must not be a write into a slice of `ts`.
test = ts[ts.index >= split_date].copy()

# Preview both sets and plot them on the same axes.
train.head()
test.head()
plt.plot(train)
plt.plot(test)
import pmdarima as pm

# Search for an ARIMA order on the training data. trace=True prints each
# candidate that is tried; failing candidates are skipped and their warnings
# are silenced. auto_arima returns the selected model already fitted on
# `train`, so no separate model.fit(train) call is needed afterwards.
model = pm.auto_arima(train, trace=True, error_action='ignore', suppress_warnings=True)
print(model.summary())
# Diagnostic plots for the fitted model.
model.plot_diagnostics(figsize=(8, 8))
plt.show()
# Forecast one value per observation in the test set.
forecast1 = model.predict(n_periods=len(test))
# np.asarray drops any index the forecast may carry, so the values are placed
# positionally against test.index. Passing an indexed Series straight to
# pd.DataFrame together with index=/columns= would re-align it instead and
# can leave the 'Prediction' column entirely NaN.
forecast1 = pd.DataFrame(np.asarray(forecast1), index=test.index, columns=['Prediction'])

# Plot the predictions for the test set. Only the observed column of `test`
# is drawn, so re-running this after the prediction column has been added
# does not draw the prediction twice. The legend call is what makes the
# three labels visible.
plt.plot(train, label='Train')
plt.plot(test['temperature'], label='Test')
plt.plot(forecast1, label='Prediction')
plt.legend(loc='best')
plt.show();

# Store the prediction next to the observed value. `test` is copied first so
# the new column is written to an independent frame, and the column is taken
# as a Series rather than assigning a whole DataFrame to one column.
test = test.copy()
test['predicted_value'] = forecast1['Prediction']
test.head()

from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(test.temperature, test.predicted_value)
print('The mean squared error is: {}'.format(MSE))
ts.head()
# Turn the date index back into an ordinary column.
ts_prof = ts.reset_index()
ts_prof.head()
# Rename to the 'ds' / 'y' column names that the Prophet model is fitted on
# further down.
prophet_column_names = {'date': 'ds', 'temperature': 'y'}
ts_prof = ts_prof.rename(columns=prophet_column_names)
ts_prof.head()

# Plot the renamed frame against its date column.
indexed_by_date = ts_prof.set_index('ds')
ax = indexed_by_date.plot(figsize=(15, 8))
ax.set_ylabel('Monthly Temperature Anomality ')
ax.set_xlabel('Date')
plt.show()
# The library is published as `prophet` from version 1.0 onwards and as
# `fbprophet` before that. Try the current name first and fall back to the
# old one, so the import works with either installation.
try:
    from prophet import Prophet as proph
except ImportError:
    from fbprophet import Prophet as proph

# Fit a model whose uncertainty interval has width 0.95.
Model = proph(interval_width=0.95)
Model.fit(ts_prof)

# Extend the date range by 120 month-start steps beyond the fitted data and
# predict over the whole extended range.
future_dates = Model.make_future_dataframe(periods=120, freq='MS')
future_dates.tail()
forecast = Model.predict(future_dates)
forecast.head()

# Keep the date, the point forecast and the interval bounds.
ts_prof_forcast = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
ts_prof_forcast.head()

# Forecast plot with its uncertainty band, followed by the component plots.
Model.plot(forecast, uncertainty=True)
Model.plot_components(forecast)
plt.show()
import plotly.graph_objects as go

# Interactive line chart of the point forecast ('yhat') over time; the range
# slider is attached to the x-axis in a separate step.
fig = px.line(ts_prof_forcast, x='ds', y='yhat', title='forcasted with Rangeslider')
fig = fig.update_xaxes(rangeslider_visible=True)
fig.show()
# Preview the observed frame and the forecast frame one after the other.
ts_prof.head()
ts_prof_forcast.head()
# Line up the forecast ('yhat') with the observed value ('y') by date: both
# frames are indexed by 'ds', the forecast side drives the join, and 'ds' is
# turned back into a column at the end.
predicted_by_date = ts_prof_forcast.set_index('ds')[['yhat']]
observed_by_date = ts_prof.set_index('ds')[['y']]
metric_df = predicted_by_date.join(observed_by_date).reset_index()
metric_df.head()
The last part of the dataframe has "NaN" for 'y' because we don't have the true values for the future dates, so we can drop these "NaN" rows.
# Discard the rows that contain a missing value.
metric_df = metric_df.dropna()

from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
)

# Mean squared error, then mean absolute error, of the forecast ('yhat')
# against the observed values ('y').
mean_squared_error(metric_df.y, metric_df.yhat)
mean_absolute_error(metric_df.y, metric_df.yhat)